Run these steps just once, after importing the TLG and PHI5 corpora and converting them to plaintext with tlgu.
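If those prerequisites haven't been done yet, here is a minimal sketch, assuming local copies of the non-free TLG and PHI5 corpora (the ~/corpora paths are illustrative):

from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.greek.tlgu import TLGU

CorpusImporter('greek').import_corpus('tlg', '~/corpora/TLG_E')
CorpusImporter('latin').import_corpus('phi5', '~/corpora/PHI5')

t = TLGU()
t.convert_corpus(corpus='tlg')   # plaintext written under ~/cltk_data/greek/text/tlg/plaintext/
t.convert_corpus(corpus='phi5')  # likewise under ~/cltk_data/latin/text/phi5/plaintext/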
In [2]:
import os
import re
import time
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from greek_accentuation.characters import base # pip install greek-accentuation
In [6]:
# Import the Greek models dir, which contains (among other things) the lemmatizer mappings
from cltk.corpus.utils.importer import CorpusImporter
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')
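To confirm the import worked, or to see what else is available for Greek, the importer exposes the list of known corpora:

corpus_importer.list_corpora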
In [7]:
# This takes ~25 mins on a good server
# make working dir
user_dir = os.path.expanduser('~/cltk_data/user_data/tlg_lemmatized_no_accents_no_stops')
if not os.path.isdir(user_dir):
    os.makedirs(user_dir, exist_ok=True)
# rm Greek numerals (e.g., αʹ, βʹ)
comp_numbers = re.compile(r'.ʹ+?')
# load lemmatizer map into memory
# http://docs.cltk.org/en/latest/greek.html#lemmatization
lemmatizer = LemmaReplacer('greek')
# get filepaths
# http://docs.cltk.org/en/latest/greek.html#tlg-indices
filepaths = assemble_tlg_author_filepaths()
# open each original file, clean, lemmatize, and write into new file
for filepath in filepaths:
    t0 = time.time()
    # open original
    with open(filepath) as fo:
        text = fo.read()
    # cleanup tlg texts
    # http://docs.cltk.org/en/latest/greek.html#text-cleanup
    text_cleaned = tlg_plaintext_cleanup(text, rm_punctuation=True, rm_periods=True)
    # rm numbers
    text_cleaned = comp_numbers.sub('', text_cleaned)
    # do lemmatization
    text_cleaned = text_cleaned.lower()
    tokens = lemmatizer.lemmatize(text_cleaned, return_string=False)
    # rm stops
    # http://docs.cltk.org/en/latest/greek.html#stopword-filtering
    tokens = [w for w in tokens if w not in greek_stops]
    # rm words shorter than 3 chars
    tokens = [w for w in tokens if len(w) > 2]
    # rm accents by reducing each character to its base form
    tokens_no_accents = [''.join(base(char) for char in word) for word in tokens]
    text_no_accents = ' '.join(tokens_no_accents)
    # write file
    file_name = os.path.split(filepath)[1]
    lemmatized_fp = os.path.join(user_dir, file_name)
    with open(lemmatized_fp, 'w') as fo:
        fo.write(text_no_accents)
    # print('Processing time for {0}: {1} secs.'.format(filepath, time.time() - t0))
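For a quick sanity check, the same stages can be run on a toy string before committing to the full ~25-minute pass; the sample text here, and the exact lemmata it yields, are illustrative and depend on the installed models:

sample = 'τῶν βίων αʹ καὶ βʹ'
sample = comp_numbers.sub('', sample)  # Greek numerals stripped
lemmata = lemmatizer.lemmatize(sample.lower(), return_string=False)
lemmata = [w for w in lemmata if w not in greek_stops and len(w) > 2]
print(' '.join(''.join(base(char) for char in w) for w in lemmata))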
In [8]:
# inspect one of the lemmatized texts, now with all markup, stopwords, and accents removed
with open(os.path.expanduser('~/cltk_data/user_data/tlg_lemmatized_no_accents_no_stops/TLG0007.TXT')) as file_open:
    text_snippet = file_open.read()[:1500]
print(text_snippet)
In [ ]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
In [ ]:
# make working dir
user_dir = os.path.expanduser('~/cltk_data/user_data/phi5_lemmatized')
if not os.path.isdir(user_dir):
    os.makedirs(user_dir, exist_ok=True)
# load lemmatizer map into memory
# http://docs.cltk.org/en/latest/latin.html#lemmatization
lemmatizer = LemmaReplacer('latin')
# get filepaths
# http://docs.cltk.org/en/latest/latin.html#phi-indices
filepaths = assemble_phi5_author_filepaths()
# open each original file, clean, lemmatize, and write into new file
for filepath in filepaths:
    # open original
    with open(filepath) as fo:
        text = fo.read()
    # cleanup phi5 texts
    # http://docs.cltk.org/en/latest/latin.html#text-cleanup
    text_cleaned = phi5_plaintext_cleanup(text, rm_punctuation=True, rm_periods=False)
    # do lemmatization
    text_lemmatized = lemmatizer.lemmatize(text_cleaned, return_string=True)
    # write file
    file_name = os.path.split(filepath)[1]
    lemmatized_fp = os.path.join(user_dir, file_name)
    with open(lemmatized_fp, 'w') as fo:
        fo.write(text_lemmatized)
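A quick spot-check of the Latin output, mirroring the Greek snippet above (LAT0474.TXT, Cicero, is just an example file name; any file in the output dir works):

with open(os.path.expanduser('~/cltk_data/user_data/phi5_lemmatized/LAT0474.TXT')) as file_open:
    print(file_open.read()[:500])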